In Class Exercise 07

Author

Henry Low

Published

October 14, 2024

Modified

October 17, 2024

Data Sources

(saved under ‘data’ folder)
From data.gov.sg:
- Master Plan 2014 Subzone Boundary (Web) (SHP)

Others:
condo_resale_2015 in csv format (i.e. condo_resale_2015.csv)

Setting Up

Load Packages

  • sf package to perform geospatial wrangling tasks 
  • corrplot package for multivariate data visualisation and analysis 
  • tidyverse package for reading csv files, dataframe processing tasks 
  • ggpubr package to assist in plotting 
  • olsrr package for building OLS and performing diagnostics tests 
  • GWModel package for calibrating geographical weighted family of models 
  • tmap package for plotting tasks 
pacman::p_load(olsrr, GWmodel, 
               # performance, corrplot, gtsummary, see # Not used
               sfdep, 
               tidyverse, sf,
               ggstatsplot, tmap, ggpubr)

Import geospatial data

Lets import the geospatial data, remembering to do the projection to 3414.

# Import geospatial data
mpsz = st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>%
  st_transform(3414)
Reading layer `MP14_SUBZONE_WEB_PL' from data source 
  `C:\Users\Henry\Desktop\SMU Masters\2024-2025 T1\Geospatial Analytics & Applications\Project\GeospatialWebsite\In-Class_Ex\In-Class_Ex_07\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21

Importing Aspatial data

Lets import the aspatial data and check it.

# Import aspatial data
condo_resale = read_csv("data/aspatial/Condo_resale_2015.csv")
Rows: 1436 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (23): LATITUDE, LONGITUDE, POSTCODE, SELLING_PRICE, AREA_SQM, AGE, PROX_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check imported data
glimpse(condo_resale)
Rows: 1,436
Columns: 23
$ LATITUDE             <dbl> 1.287145, 1.328698, 1.313727, 1.308563, 1.321437,…
$ LONGITUDE            <dbl> 103.7802, 103.8123, 103.7971, 103.8247, 103.9505,…
$ POSTCODE             <dbl> 118635, 288420, 267833, 258380, 467169, 466472, 3…
$ SELLING_PRICE        <dbl> 3000000, 3880000, 3325000, 4250000, 1400000, 1320…
$ AREA_SQM             <dbl> 309, 290, 248, 127, 145, 139, 218, 141, 165, 168,…
$ AGE                  <dbl> 30, 32, 33, 7, 28, 22, 24, 24, 27, 31, 17, 22, 6,…
$ PROX_CBD             <dbl> 7.941259, 6.609797, 6.898000, 4.038861, 11.783402…
$ PROX_CHILDCARE       <dbl> 0.16597932, 0.28027246, 0.42922669, 0.39473543, 0…
$ PROX_ELDERLYCARE     <dbl> 2.5198118, 1.9333338, 0.5021395, 1.9910316, 1.121…
$ PROX_URA_GROWTH_AREA <dbl> 6.618741, 7.505109, 6.463887, 4.906512, 6.410632,…
$ PROX_HAWKER_MARKET   <dbl> 1.76542207, 0.54507614, 0.37789301, 1.68259969, 0…
$ PROX_KINDERGARTEN    <dbl> 0.05835552, 0.61592412, 0.14120309, 0.38200076, 0…
$ PROX_MRT             <dbl> 0.5607188, 0.6584461, 0.3053433, 0.6910183, 0.528…
$ PROX_PARK            <dbl> 1.1710446, 0.1992269, 0.2779886, 0.9832843, 0.116…
$ PROX_PRIMARY_SCH     <dbl> 1.6340256, 0.9747834, 1.4715016, 1.4546324, 0.709…
$ PROX_TOP_PRIMARY_SCH <dbl> 3.3273195, 0.9747834, 1.4715016, 2.3006394, 0.709…
$ PROX_SHOPPING_MALL   <dbl> 2.2102717, 2.9374279, 1.2256850, 0.3525671, 1.307…
$ PROX_SUPERMARKET     <dbl> 0.9103958, 0.5900617, 0.4135583, 0.4162219, 0.581…
$ PROX_BUS_STOP        <dbl> 0.10336166, 0.28673408, 0.28504777, 0.29872340, 0…
$ NO_Of_UNITS          <dbl> 18, 20, 27, 30, 30, 31, 32, 32, 32, 32, 34, 34, 3…
$ FAMILY_FRIENDLY      <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
$ FREEHOLD             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1…
$ LEASEHOLD_99YR       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Since the file is in csv, there is no metadata to indicate what the CRS is. Therefore, I need to assign a CRS first (4326) before doing the projection. The “4326” is best guess estimate and in practice I would have to plot to see if the coordinates fall within the expected plot to verify.

# Convert tibble to sf 
condo_resale.sf <- st_as_sf(condo_resale,
                            coords = c("LONGITUDE", "LATITUDE"),
                            crs=4326) %>%
  st_transform(crs=3414)

# Check result
head(condo_resale.sf)
Simple feature collection with 6 features and 21 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 22085.12 ymin: 29951.54 xmax: 41042.56 ymax: 34546.2
Projected CRS: SVY21 / Singapore TM
# A tibble: 6 × 22
  POSTCODE SELLING_PRICE AREA_SQM   AGE PROX_CBD PROX_CHILDCARE PROX_ELDERLYCARE
     <dbl>         <dbl>    <dbl> <dbl>    <dbl>          <dbl>            <dbl>
1   118635       3000000      309    30     7.94          0.166            2.52 
2   288420       3880000      290    32     6.61          0.280            1.93 
3   267833       3325000      248    33     6.90          0.429            0.502
4   258380       4250000      127     7     4.04          0.395            1.99 
5   467169       1400000      145    28    11.8           0.119            1.12 
6   466472       1320000      139    22    10.3           0.125            0.789
# ℹ 15 more variables: PROX_URA_GROWTH_AREA <dbl>, PROX_HAWKER_MARKET <dbl>,
#   PROX_KINDERGARTEN <dbl>, PROX_MRT <dbl>, PROX_PARK <dbl>,
#   PROX_PRIMARY_SCH <dbl>, PROX_TOP_PRIMARY_SCH <dbl>,
#   PROX_SHOPPING_MALL <dbl>, PROX_SUPERMARKET <dbl>, PROX_BUS_STOP <dbl>,
#   NO_Of_UNITS <dbl>, FAMILY_FRIENDLY <dbl>, FREEHOLD <dbl>,
#   LEASEHOLD_99YR <dbl>, geometry <POINT [m]>

Save processed datasets

Now once basic data processing is completed, I will proceed to save it in rds format for easy retrieval.

write_rds(condo_resale, "data/processed/condo_resale.rds")
write_rds(condo_resale.sf, "data/processed/condo_resale.sf.rds")
write_rds(mpsz, "data/processed/mpsz.rds")

Exploratory Data Analysis (EDA)

First, I’ll clear the R cache and load in the datasets

# Clear R console
rm(list = ls(all.names = TRUE))

# Load datasets
condo_resale <- read_rds("data/processed/condo_resale.rds")
condo_resale.sf <- read_rds("data/processed/condo_resale.sf.rds")
mpsz <- read_rds("data/processed/mpsz.rds")

ggcorrmat() from ggstatsplot package is used to build the correlation matrix

# Build correlation matrix
ggcorrmat(condo_resale[, 5:23])

Dont need to worry about VIF for K-means, but it is important in multiple linear regression.

From this, we can see that there is very high correlation between LEASEHOLD_99YR and FREEHOLD. But does this matter? We will find later.

Building Multiple Linear Regression Model

lm() is used to create the multiple linear regression model. After that, summary() is used to check the results

# Build multiple lienar regression model
condo.mlr <- lm(formula = SELLING_PRICE ~ AREA_SQM + AGE    + 
                  PROX_CBD + PROX_CHILDCARE + PROX_ELDERLYCARE +
                  PROX_URA_GROWTH_AREA + PROX_HAWKER_MARKET + PROX_KINDERGARTEN + 
                  PROX_MRT  + PROX_PARK + PROX_PRIMARY_SCH + 
                  PROX_TOP_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_SUPERMARKET + 
                  PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD + LEASEHOLD_99YR,  # Add in LEASEHOLD_99YR to test VIF later
                data=condo_resale.sf)

# Check results
summary(condo.mlr)

Call:
lm(formula = SELLING_PRICE ~ AREA_SQM + AGE + PROX_CBD + PROX_CHILDCARE + 
    PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + PROX_HAWKER_MARKET + 
    PROX_KINDERGARTEN + PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
    PROX_TOP_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_SUPERMARKET + 
    PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD + 
    LEASEHOLD_99YR, data = condo_resale.sf)

Residuals:
     Min       1Q   Median       3Q      Max 
-3471036  -286903   -22426   239412 12254549 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)           543071.4   136210.9   3.987 7.03e-05 ***
AREA_SQM               12688.7      370.1  34.283  < 2e-16 ***
AGE                   -24566.0     2766.0  -8.881  < 2e-16 ***
PROX_CBD              -78122.0     6791.4 -11.503  < 2e-16 ***
PROX_CHILDCARE       -333219.0   111020.3  -3.001 0.002734 ** 
PROX_ELDERLYCARE      170950.0    42110.8   4.060 5.19e-05 ***
PROX_URA_GROWTH_AREA   38507.6    12523.7   3.075 0.002147 ** 
PROX_HAWKER_MARKET     23801.2    29299.9   0.812 0.416739    
PROX_KINDERGARTEN     144098.0    82738.7   1.742 0.081795 .  
PROX_MRT             -322775.9    58528.1  -5.515 4.14e-08 ***
PROX_PARK             564487.9    66563.0   8.481  < 2e-16 ***
PROX_PRIMARY_SCH      186170.5    65515.2   2.842 0.004553 ** 
PROX_TOP_PRIMARY_SCH    -477.1    20598.0  -0.023 0.981525    
PROX_SHOPPING_MALL   -207721.5    42855.5  -4.847 1.39e-06 ***
PROX_SUPERMARKET      -48074.7    77145.3  -0.623 0.533273    
PROX_BUS_STOP         675755.0   138552.0   4.877 1.20e-06 ***
NO_Of_UNITS             -216.2       90.3  -2.394 0.016797 *  
FAMILY_FRIENDLY       142128.3    47055.1   3.020 0.002569 ** 
FREEHOLD              300646.5    77296.5   3.890 0.000105 ***
LEASEHOLD_99YR        -77137.4    77570.9  -0.994 0.320192    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 755800 on 1416 degrees of freedom
Multiple R-squared:  0.652, Adjusted R-squared:  0.6474 
F-statistic: 139.6 on 19 and 1416 DF,  p-value: < 2.2e-16

Output is an LM object with various important information - residual: to be used later in geospatial - fitted value: estimated price

Model Assessment (Olsrr)

In this section, we will make use of olsrr(). It has a variety of useful methods to build better models.

Tidy Report

First, ols_regress() will be used to print out a nice report.

# Print report
ols_regress(condo.mlr)
                                Model Summary                                 
-----------------------------------------------------------------------------
R                            0.807       RMSE                     750537.537 
R-Squared                    0.652       MSE                571262902261.223 
Adj. R-Squared               0.647       Coef. Var                    43.160 
Pred R-Squared               0.637       AIC                       42971.173 
MAE                     412117.987       SBC                       43081.835 
-----------------------------------------------------------------------------
 RMSE: Root Mean Square Error 
 MSE: Mean Square Error 
 MAE: Mean Absolute Error 
 AIC: Akaike Information Criteria 
 SBC: Schwarz Bayesian Criteria 

                                     ANOVA                                       
--------------------------------------------------------------------------------
                    Sum of                                                      
                   Squares          DF         Mean Square       F         Sig. 
--------------------------------------------------------------------------------
Regression    1.515738e+15          19        7.977571e+13    139.648    0.0000 
Residual      8.089083e+14        1416    571262902261.223                      
Total         2.324647e+15        1435                                          
--------------------------------------------------------------------------------

                                               Parameter Estimates                                                
-----------------------------------------------------------------------------------------------------------------
               model           Beta    Std. Error    Std. Beta       t        Sig           lower          upper 
-----------------------------------------------------------------------------------------------------------------
         (Intercept)     543071.420    136210.918                   3.987    0.000     275874.535     810268.305 
            AREA_SQM      12688.669       370.119        0.579     34.283    0.000      11962.627      13414.710 
                 AGE     -24566.001      2766.041       -0.166     -8.881    0.000     -29991.980     -19140.022 
            PROX_CBD     -78121.985      6791.377       -0.267    -11.503    0.000     -91444.227     -64799.744 
      PROX_CHILDCARE    -333219.036    111020.303       -0.087     -3.001    0.003    -551000.984    -115437.089 
    PROX_ELDERLYCARE     170949.961     42110.748        0.083      4.060    0.000      88343.803     253556.120 
PROX_URA_GROWTH_AREA      38507.622     12523.661        0.059      3.075    0.002      13940.700      63074.545 
  PROX_HAWKER_MARKET      23801.197     29299.923        0.019      0.812    0.417     -33674.725      81277.120 
   PROX_KINDERGARTEN     144097.972     82738.669        0.030      1.742    0.082     -18205.570     306401.514 
            PROX_MRT    -322775.874     58528.079       -0.123     -5.515    0.000    -437586.937    -207964.811 
           PROX_PARK     564487.876     66563.011        0.148      8.481    0.000     433915.162     695060.590 
    PROX_PRIMARY_SCH     186170.524     65515.193        0.072      2.842    0.005      57653.253     314687.795 
PROX_TOP_PRIMARY_SCH       -477.073     20597.972       -0.001     -0.023    0.982     -40882.894      39928.747 
  PROX_SHOPPING_MALL    -207721.520     42855.500       -0.109     -4.847    0.000    -291788.613    -123654.427 
    PROX_SUPERMARKET     -48074.679     77145.257       -0.012     -0.623    0.533    -199405.956     103256.599 
       PROX_BUS_STOP     675755.044    138551.991        0.133      4.877    0.000     403965.817     947544.272 
         NO_Of_UNITS       -216.180        90.302       -0.046     -2.394    0.017       -393.320        -39.040 
     FAMILY_FRIENDLY     142128.272     47055.082        0.056      3.020    0.003      49823.107     234433.438 
            FREEHOLD     300646.543     77296.529        0.117      3.890    0.000     149018.525     452274.561 
      LEASEHOLD_99YR     -77137.375     77570.869       -0.030     -0.994    0.320    -229303.551      75028.801 
-----------------------------------------------------------------------------------------------------------------

From the report, it is clear that not all variables are statistically significant.

Multicolinearity Test

ols_vif_tol() is then used to check for multicolinearity.

# Multicolinearity Test
ols_vif_tol(condo.mlr)
              Variables Tolerance      VIF
1              AREA_SQM 0.8601326 1.162611
2                   AGE 0.7011585 1.426211
3              PROX_CBD 0.4575471 2.185567
4        PROX_CHILDCARE 0.2898233 3.450378
5      PROX_ELDERLYCARE 0.5922238 1.688551
6  PROX_URA_GROWTH_AREA 0.6614081 1.511926
7    PROX_HAWKER_MARKET 0.4373874 2.286303
8     PROX_KINDERGARTEN 0.8356793 1.196631
9              PROX_MRT 0.4949877 2.020252
10            PROX_PARK 0.8015728 1.247547
11     PROX_PRIMARY_SCH 0.3823248 2.615577
12 PROX_TOP_PRIMARY_SCH 0.4878620 2.049760
13   PROX_SHOPPING_MALL 0.4903052 2.039546
14     PROX_SUPERMARKET 0.6142127 1.628100
15        PROX_BUS_STOP 0.3311024 3.020213
16          NO_Of_UNITS 0.6543336 1.528272
17      FAMILY_FRIENDLY 0.7191719 1.390488
18             FREEHOLD 0.2728521 3.664990
19       LEASEHOLD_99YR 0.2645988 3.779307

Even though FREEHOLD and LEASEHOLD_99YR is highly correlated, since VIF shows that they are <5, we dont have to remove either one.

Variable Selection

For variable selection, olsrr provides a few options

  1. Forward stepwise - Include 1 by 1.
  2. Backward stepwise - Remove 1 by 1/
  3. Mix stepwise - take out with replacement

Normally we use forward_p since we want to focus on the variable being statistically significant.

# Forward stepwise regression
condo_fw_mlr <- ols_step_forward_p(
  condo.mlr,
  p_val = 0.05,
  details = FALSE # True if want to see the output each step
)

Checking the results, I can just print the output.

condo_fw_mlr

                                     Stepwise Summary                                      
-----------------------------------------------------------------------------------------
Step    Variable                   AIC          SBC         SBIC         R2       Adj. R2 
-----------------------------------------------------------------------------------------
 0      Base Model              44449.068    44459.608    40371.745    0.00000    0.00000 
 1      AREA_SQM                43587.753    43603.562    39510.883    0.45184    0.45146 
 2      PROX_CBD                43243.523    43264.602    39167.182    0.56928    0.56868 
 3      PROX_PARK               43177.691    43204.039    39101.331    0.58915    0.58829 
 4      FREEHOLD                43125.474    43157.092    39049.179    0.60438    0.60327 
 5      AGE                     43069.222    43106.109    38993.167    0.62010    0.61878 
 6      PROX_ELDERLYCARE        43046.515    43088.672    38970.548    0.62659    0.62502 
 7      PROX_SHOPPING_MALL      43020.990    43068.417    38945.209    0.63367    0.63188 
 8      PROX_URA_GROWTH_AREA    43009.092    43061.788    38933.407    0.63720    0.63517 
 9      PROX_MRT                42999.058    43057.024    38923.483    0.64023    0.63796 
 10     PROX_BUS_STOP           42984.951    43048.186    38909.582    0.64424    0.64175 
 11     FAMILY_FRIENDLY         42981.085    43049.590    38905.797    0.64569    0.64296 
 12     NO_Of_UNITS             42975.246    43049.021    38900.092    0.64762    0.64465 
 13     PROX_CHILDCARE          42971.858    43050.902    38896.812    0.64894    0.64573 
 14     PROX_PRIMARY_SCH        42966.758    43051.072    38891.872    0.65067    0.64723 
-----------------------------------------------------------------------------------------

Final Model Output 
------------------

                                Model Summary                                 
-----------------------------------------------------------------------------
R                            0.807       RMSE                     751998.679 
R-Squared                    0.651       MSE                571471422208.591 
Adj. R-Squared               0.647       Coef. Var                    43.168 
Pred R-Squared               0.638       AIC                       42966.758 
MAE                     414819.628       SBC                       43051.072 
-----------------------------------------------------------------------------
 RMSE: Root Mean Square Error 
 MSE: Mean Square Error 
 MAE: Mean Absolute Error 
 AIC: Akaike Information Criteria 
 SBC: Schwarz Bayesian Criteria 

                                     ANOVA                                       
--------------------------------------------------------------------------------
                    Sum of                                                      
                   Squares          DF         Mean Square       F         Sig. 
--------------------------------------------------------------------------------
Regression    1.512586e+15          14        1.080418e+14    189.059    0.0000 
Residual      8.120609e+14        1421    571471422208.591                      
Total         2.324647e+15        1435                                          
--------------------------------------------------------------------------------

                                               Parameter Estimates                                                
-----------------------------------------------------------------------------------------------------------------
               model           Beta    Std. Error    Std. Beta       t        Sig           lower          upper 
-----------------------------------------------------------------------------------------------------------------
         (Intercept)     527633.222    108183.223                   4.877    0.000     315417.244     739849.200 
            AREA_SQM      12777.523       367.479        0.584     34.771    0.000      12056.663      13498.382 
            PROX_CBD     -77131.323      5763.125       -0.263    -13.384    0.000     -88436.469     -65826.176 
           PROX_PARK     570504.807     65507.029        0.150      8.709    0.000     442003.938     699005.677 
            FREEHOLD     350599.812     48506.485        0.136      7.228    0.000     255447.802     445751.821 
                 AGE     -24687.739      2754.845       -0.167     -8.962    0.000     -30091.739     -19283.740 
    PROX_ELDERLYCARE     185575.623     39901.864        0.090      4.651    0.000     107302.737     263848.510 
  PROX_SHOPPING_MALL    -220947.251     36561.832       -0.115     -6.043    0.000    -292668.213    -149226.288 
PROX_URA_GROWTH_AREA      39163.254     11754.829        0.060      3.332    0.001      16104.571      62221.936 
            PROX_MRT    -294745.107     56916.367       -0.112     -5.179    0.000    -406394.234    -183095.980 
       PROX_BUS_STOP     682482.221    134513.243        0.134      5.074    0.000     418616.359     946348.082 
     FAMILY_FRIENDLY     146307.576     46893.021        0.057      3.120    0.002      54320.593     238294.560 
         NO_Of_UNITS       -245.480        87.947       -0.053     -2.791    0.005       -418.000        -72.961 
      PROX_CHILDCARE    -318472.751    107959.512       -0.084     -2.950    0.003    -530249.889    -106695.613 
    PROX_PRIMARY_SCH     159856.136     60234.599        0.062      2.654    0.008      41697.849     278014.424 
-----------------------------------------------------------------------------------------------------------------

The plot() function can be used to see how the statistical results change with each step.

plot(condo_fw_mlr)

Model parameters can also be visualised with ggcoefstats()

# Plot coefficient statistics
ggcoefstats(condo_fw_mlr$model, sort = "ascending")
Number of labels is greater than default palette color count.
• Select another color `palette` (and/or `package`).

Non-Linearity Test

In the context of multiple linear regression, it is important to test model created to ensure that the assumption that linearity and additivity of the relationship between dependent and independent variables hold.

ols_plot_resid_fit() from olsrr package is used to visualise the relationship between residual and fitted values

# Plot residual vs fitted values
ols_plot_resid_fit(condo_fw_mlr$model)

Since majority of the point hover around the red line, it means that the model conforms to the linearity assumption.

Normality Assumption Test

ols_plot_resid_hist() is then used to test the normality assumption.

# Normality assumption test visualization
ols_plot_resid_hist(condo_fw_mlr$model)

This shows that the residual resembles a normal distribution. Another way of doing this test is using ols_test-normality().

# Alternative normality assumption test
ols_test_normality(condo_fw_mlr$model)
Warning in ks.test.default(y, "pnorm", mean(y), sd(y)): ties should not be
present for the one-sample Kolmogorov-Smirnov test
-----------------------------------------------
       Test             Statistic       pvalue  
-----------------------------------------------
Shapiro-Wilk              0.6856         0.0000 
Kolmogorov-Smirnov        0.1366         0.0000 
Cramer-von Mises         121.0768        0.0000 
Anderson-Darling         67.9551         0.0000 
-----------------------------------------------

Since the summary table shows the p-values of all tests being smaller than alpha value of 0.05, I can reject the null hypothesis and infer that there is statistical evidence that residuals are not normally distributed.

Spatial Autocorrelation Test

Since the context of hedonic modelling is for geospatial analysis, we need to visualize the residual of the model

First, I will export the residual and save it as a dataframe.

# Export residual
mlr_output <- as.data.frame(condo_fw_mlr$model$residuals) %>%
  rename(`FW_MLR_RES` = `condo_fw_mlr$model$residuals`)

Then I’ll join the dataframe with condo_resale_sf object.

# Join residual dataframe with sf dataframe
condo_resale_sf <- cbind(condo_resale.sf, mlr_output$FW_MLR_RES) %>%
  rename(`MLR_RES` = `mlr_output.FW_MLR_RES`)

Finally, i will use tmap to display the distribution of the residuals on an interactive map.

# Set to interactive
tmap_mode("view")
tmap mode set to interactive viewing
# Visualzize point symbol map
tm_shape(mpsz)+
  tmap_options(check.and.fix = TRUE) +
  tm_polygons(alpha = 0.4) +
tm_shape(condo_resale_sf) + 
  tm_dots(col = "MLR_RES",
          alpha = 0.6,
          style = "quantile") 
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Variable(s) "MLR_RES" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
# Check tmap mode back to plot
tmap_mode("plot")
tmap mode set to plotting

The map looks like there are some spatial autocorrelation. More investigation needs to be done.

Spatial Stationary Test

To prove that there are indeed spatial autocorrelation, Moran’s I test will be used. Spatial weights are created with sfdep package with st_weights() on the neighbour list created with st_knn(). Longlat is set to be FALSE because we dont want it to do the projection.

# Create spatial weights
condo_resale_sf <- condo_resale_sf %>%
  mutate(nb = st_knn(geometry, k = 6, longlat = FALSE),
         wt = st_weights(nb, style = "W"),
         .before = 1)

global_moran_perm() from sfdep package will then be used to perform the global Moran’s I test.

# Set seed
set.seed(42)

# Perform Global Moran's I test
global_moran <- global_moran_perm(condo_resale_sf$MLR_RES,
                  condo_resale_sf$nb,
                  condo_resale_sf$wt,
                  alternative = "two.sided",
                  nsim = 99)
global_moran

    Monte-Carlo simulation of Moran I

data:  x 
weights: listw  
number of simulations + 1: 100 

statistic = 0.32254, observed rank = 100, p-value < 2.2e-16
alternative hypothesis: two.sided

Since the observed Global Moran’s I test shows that the p-value is less than the alpha value of 0.05, I will reject the null hypothesis that the residuals are randomly distributed. With a observed global Moran I test statistic of 0.32254, which is greater than 0, I can infer that the residuals resemble cluster distribution.

Building Hedonic Pricing Models (GWmodel)

In this section, I will model hedonic pricing with a geographically weighted regression model.

Build Fixed Bandwith GWR Model

bw.gwr() from GWmodel package is used to determine the optiaml fixed bandwidth for the model. While there are 2 ways to determine the stopping rule:

  • CV - cross-validatiation approach
  • AIC - AIC corrected (AICc) approach

I will use CV method.

# Calculate optimal fixed bandwidth
bw_fixed <- bw.gwr(formula = SELLING_PRICE ~ AREA_SQM + AGE + 
                     PROX_CBD + PROX_CHILDCARE + 
                     PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
                     PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
                     PROX_SHOPPING_MALL + PROX_BUS_STOP + 
                     NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD,
                   data = condo_resale_sf,
                   approach = "CV",
                   kernel="gaussian",
                   adaptive = FALSE,
                   longlat=FALSE)
Fixed bandwidth: 17660.96 CV score: 8.259118e+14 
Fixed bandwidth: 10917.26 CV score: 7.970454e+14 
Fixed bandwidth: 6749.419 CV score: 7.273273e+14 
Fixed bandwidth: 4173.553 CV score: 6.300006e+14 
Fixed bandwidth: 2581.58 CV score: 5.404958e+14 
Fixed bandwidth: 1597.687 CV score: 4.857515e+14 
Fixed bandwidth: 989.6077 CV score: 4.722431e+14 
Fixed bandwidth: 613.7939 CV score: 1.378294e+16 
Fixed bandwidth: 1221.873 CV score: 4.778717e+14 
Fixed bandwidth: 846.0596 CV score: 4.791629e+14 
Fixed bandwidth: 1078.325 CV score: 4.751406e+14 
Fixed bandwidth: 934.7772 CV score: 4.72518e+14 
Fixed bandwidth: 1023.495 CV score: 4.730305e+14 
Fixed bandwidth: 968.6643 CV score: 4.721317e+14 
Fixed bandwidth: 955.7206 CV score: 4.722072e+14 
Fixed bandwidth: 976.6639 CV score: 4.721387e+14 
Fixed bandwidth: 963.7202 CV score: 4.721484e+14 
Fixed bandwidth: 971.7199 CV score: 4.721293e+14 
Fixed bandwidth: 973.6083 CV score: 4.721309e+14 
Fixed bandwidth: 970.5527 CV score: 4.721295e+14 
Fixed bandwidth: 972.4412 CV score: 4.721296e+14 
Fixed bandwidth: 971.2741 CV score: 4.721292e+14 
Fixed bandwidth: 970.9985 CV score: 4.721293e+14 
Fixed bandwidth: 971.4443 CV score: 4.721292e+14 
Fixed bandwidth: 971.5496 CV score: 4.721293e+14 
Fixed bandwidth: 971.3793 CV score: 4.721292e+14 
Fixed bandwidth: 971.3391 CV score: 4.721292e+14 
Fixed bandwidth: 971.3143 CV score: 4.721292e+14 
Fixed bandwidth: 971.3545 CV score: 4.721292e+14 
Fixed bandwidth: 971.3296 CV score: 4.721292e+14 
Fixed bandwidth: 971.345 CV score: 4.721292e+14 
Fixed bandwidth: 971.3355 CV score: 4.721292e+14 
Fixed bandwidth: 971.3413 CV score: 4.721292e+14 
Fixed bandwidth: 971.3377 CV score: 4.721292e+14 
Fixed bandwidth: 971.34 CV score: 4.721292e+14 
Fixed bandwidth: 971.3405 CV score: 4.721292e+14 
Fixed bandwidth: 971.3408 CV score: 4.721292e+14 
Fixed bandwidth: 971.3403 CV score: 4.721292e+14 
Fixed bandwidth: 971.3406 CV score: 4.721292e+14 
Fixed bandwidth: 971.3404 CV score: 4.721292e+14 
Fixed bandwidth: 971.3405 CV score: 4.721292e+14 
Fixed bandwidth: 971.3405 CV score: 4.721292e+14 

This shows that the recommended bandwidth is 971.3405 meters. ::: note See st_crs - CS length is in meters :::

Now the fixed bandwidth can be used to calibrate the model with gwr.basic(). The output is saved as a class gwrm. We can check it by printing. It contains a SDF dataframe which has a lot of useful information.

# Calibrate model with fixed bandwidth
gwr_fixed <- gwr.basic(formula = SELLING_PRICE ~ AREA_SQM + AGE + 
                     PROX_CBD + PROX_CHILDCARE + 
                     PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
                     PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
                     PROX_SHOPPING_MALL + PROX_BUS_STOP + 
                     NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD,
                   data = condo_resale_sf,
                   bw = bw_fixed,
                   kernel="gaussian",
                   adaptive = FALSE,
                   longlat=FALSE)

# Check results
gwr_fixed
   ***********************************************************************
   *                       Package   GWmodel                             *
   ***********************************************************************
   Program starts at: 2024-10-17 00:03:36.641365 
   Call:
   gwr.basic(formula = SELLING_PRICE ~ AREA_SQM + AGE + PROX_CBD + 
    PROX_CHILDCARE + PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
    PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + PROX_SHOPPING_MALL + 
    PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD, 
    data = condo_resale_sf, bw = bw_fixed, kernel = "gaussian", 
    adaptive = FALSE, longlat = FALSE)

   Dependent (y) variable:  SELLING_PRICE
   Independent variables:  AREA_SQM AGE PROX_CBD PROX_CHILDCARE PROX_ELDERLYCARE PROX_URA_GROWTH_AREA PROX_MRT PROX_PARK PROX_PRIMARY_SCH PROX_SHOPPING_MALL PROX_BUS_STOP NO_Of_UNITS FAMILY_FRIENDLY FREEHOLD
   Number of data points: 1436
   ***********************************************************************
   *                    Results of Global Regression                     *
   ***********************************************************************

   Call:
    lm(formula = formula, data = data)

   Residuals:
     Min       1Q   Median       3Q      Max 
-3470778  -298119   -23481   248917 12234210 

   Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
   (Intercept)           527633.22  108183.22   4.877 1.20e-06 ***
   AREA_SQM               12777.52     367.48  34.771  < 2e-16 ***
   AGE                   -24687.74    2754.84  -8.962  < 2e-16 ***
   PROX_CBD              -77131.32    5763.12 -13.384  < 2e-16 ***
   PROX_CHILDCARE       -318472.75  107959.51  -2.950 0.003231 ** 
   PROX_ELDERLYCARE      185575.62   39901.86   4.651 3.61e-06 ***
   PROX_URA_GROWTH_AREA   39163.25   11754.83   3.332 0.000885 ***
   PROX_MRT             -294745.11   56916.37  -5.179 2.56e-07 ***
   PROX_PARK             570504.81   65507.03   8.709  < 2e-16 ***
   PROX_PRIMARY_SCH      159856.14   60234.60   2.654 0.008046 ** 
   PROX_SHOPPING_MALL   -220947.25   36561.83  -6.043 1.93e-09 ***
   PROX_BUS_STOP         682482.22  134513.24   5.074 4.42e-07 ***
   NO_Of_UNITS             -245.48      87.95  -2.791 0.005321 ** 
   FAMILY_FRIENDLY       146307.58   46893.02   3.120 0.001845 ** 
   FREEHOLD              350599.81   48506.48   7.228 7.98e-13 ***

   ---Significance stars
   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
   Residual standard error: 756000 on 1421 degrees of freedom
   Multiple R-squared: 0.6507
   Adjusted R-squared: 0.6472 
   F-statistic: 189.1 on 14 and 1421 DF,  p-value: < 2.2e-16 
   ***Extra Diagnostic information
   Residual sum of squares: 8.120609e+14
   Sigma(hat): 752522.9
   AIC:  42966.76
   AICc:  42967.14
   BIC:  41731.39
   ***********************************************************************
   *          Results of Geographically Weighted Regression              *
   ***********************************************************************

   *********************Model calibration information*********************
   Kernel function: gaussian 
   Fixed bandwidth: 971.3405 
   Regression points: the same locations as observations are used.
   Distance metric: Euclidean distance metric is used.

   ****************Summary of GWR coefficient estimates:******************
                               Min.     1st Qu.      Median     3rd Qu.
   Intercept            -3.5988e+07 -5.1998e+05  7.6780e+05  1.7412e+06
   AREA_SQM              1.0003e+03  5.2758e+03  7.4740e+03  1.2301e+04
   AGE                  -1.3475e+05 -2.0813e+04 -8.6260e+03 -3.7784e+03
   PROX_CBD             -7.7047e+07 -2.3608e+05 -8.3600e+04  3.4646e+04
   PROX_CHILDCARE       -6.0097e+06 -3.3667e+05 -9.7425e+04  2.9007e+05
   PROX_ELDERLYCARE     -3.5000e+06 -1.5970e+05  3.1971e+04  1.9577e+05
   PROX_URA_GROWTH_AREA -3.0170e+06 -8.2013e+04  7.0749e+04  2.2612e+05
   PROX_MRT             -3.5282e+06 -6.5836e+05 -1.8833e+05  3.6922e+04
   PROX_PARK            -1.2062e+06 -2.1732e+05  3.5383e+04  4.1335e+05
   PROX_PRIMARY_SCH     -2.2695e+07 -1.7066e+05  4.8472e+04  5.1555e+05
   PROX_SHOPPING_MALL   -7.2585e+06 -1.6684e+05 -1.0517e+04  1.5923e+05
   PROX_BUS_STOP        -1.4676e+06 -4.5207e+04  3.7601e+05  1.1664e+06
   NO_Of_UNITS          -1.3170e+03 -2.4822e+02 -3.0846e+01  2.5496e+02
   FAMILY_FRIENDLY      -2.2749e+06 -1.1140e+05  7.6214e+03  1.6107e+05
   FREEHOLD             -9.2067e+06  3.8073e+04  1.5169e+05  3.7528e+05
                             Max.
   Intercept            112793548
   AREA_SQM                 21575
   AGE                     434201
   PROX_CBD               2704596
   PROX_CHILDCARE         1654087
   PROX_ELDERLYCARE      38867814
   PROX_URA_GROWTH_AREA  78515730
   PROX_MRT               3124316
   PROX_PARK             18122425
   PROX_PRIMARY_SCH       4637503
   PROX_SHOPPING_MALL     1529952
   PROX_BUS_STOP         11342182
   NO_Of_UNITS              12907
   FAMILY_FRIENDLY        1720744
   FREEHOLD               6073636
   ************************Diagnostic information*************************
   Number of data points: 1436 
   Effective number of parameters (2trace(S) - trace(S'S)): 438.3804 
   Effective degrees of freedom (n-2trace(S) + trace(S'S)): 997.6196 
   AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 42263.61 
   AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 41632.36 
   BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 42515.71 
   Residual sum of squares: 2.53407e+14 
   R-square value:  0.8909912 
   Adjusted R-square value:  0.8430417 

   ***********************************************************************
   Program stops at: 2024-10-17 00:03:37.458359 

R-square improved form 0.64 to 0.84, which is very significant.

Build Adaptive Bandwidth GWR Model

Now, I will repeat the same but for adaptive bandwidth this time.

# Select optimal adaptive bandwidth
bw_adaptive <- bw.gwr(formula = SELLING_PRICE ~ AREA_SQM + AGE + 
                     PROX_CBD + PROX_CHILDCARE + 
                     PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
                     PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
                     PROX_SHOPPING_MALL + PROX_BUS_STOP + 
                     NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD,
                   data = condo_resale_sf,
                   approach = "CV",
                   kernel="gaussian",
                   adaptive = TRUE,
                   longlat=FALSE)
Adaptive bandwidth: 895 CV score: 7.952401e+14 
Adaptive bandwidth: 561 CV score: 7.667364e+14 
Adaptive bandwidth: 354 CV score: 6.953454e+14 
Adaptive bandwidth: 226 CV score: 6.15223e+14 
Adaptive bandwidth: 147 CV score: 5.674373e+14 
Adaptive bandwidth: 98 CV score: 5.426745e+14 
Adaptive bandwidth: 68 CV score: 5.168117e+14 
Adaptive bandwidth: 49 CV score: 4.859631e+14 
Adaptive bandwidth: 37 CV score: 4.646518e+14 
Adaptive bandwidth: 30 CV score: 4.422088e+14 
Adaptive bandwidth: 25 CV score: 4.430816e+14 
Adaptive bandwidth: 32 CV score: 4.505602e+14 
Adaptive bandwidth: 27 CV score: 4.462172e+14 
Adaptive bandwidth: 30 CV score: 4.422088e+14 
gwr_adaptive <- gwr.basic(formula = SELLING_PRICE ~ AREA_SQM + AGE + 
                     PROX_CBD + PROX_CHILDCARE + 
                     PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
                     PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
                     PROX_SHOPPING_MALL + PROX_BUS_STOP + 
                     NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD,
                   data = condo_resale_sf,
                   bw = bw_adaptive,
                   kernel="gaussian",
                   adaptive = TRUE,
                   longlat=FALSE)

# Check results
gwr_adaptive
   ***********************************************************************
   *                       Package   GWmodel                             *
   ***********************************************************************
   Program starts at: 2024-10-17 00:03:44.40474 
   Call:
   gwr.basic(formula = SELLING_PRICE ~ AREA_SQM + AGE + PROX_CBD + 
    PROX_CHILDCARE + PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + 
    PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + PROX_SHOPPING_MALL + 
    PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD, 
    data = condo_resale_sf, bw = bw_adaptive, kernel = "gaussian", 
    adaptive = TRUE, longlat = FALSE)

   Dependent (y) variable:  SELLING_PRICE
   Independent variables:  AREA_SQM AGE PROX_CBD PROX_CHILDCARE PROX_ELDERLYCARE PROX_URA_GROWTH_AREA PROX_MRT PROX_PARK PROX_PRIMARY_SCH PROX_SHOPPING_MALL PROX_BUS_STOP NO_Of_UNITS FAMILY_FRIENDLY FREEHOLD
   Number of data points: 1436
   ***********************************************************************
   *                    Results of Global Regression                     *
   ***********************************************************************

   Call:
    lm(formula = formula, data = data)

   Residuals:
     Min       1Q   Median       3Q      Max 
-3470778  -298119   -23481   248917 12234210 

   Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
   (Intercept)           527633.22  108183.22   4.877 1.20e-06 ***
   AREA_SQM               12777.52     367.48  34.771  < 2e-16 ***
   AGE                   -24687.74    2754.84  -8.962  < 2e-16 ***
   PROX_CBD              -77131.32    5763.12 -13.384  < 2e-16 ***
   PROX_CHILDCARE       -318472.75  107959.51  -2.950 0.003231 ** 
   PROX_ELDERLYCARE      185575.62   39901.86   4.651 3.61e-06 ***
   PROX_URA_GROWTH_AREA   39163.25   11754.83   3.332 0.000885 ***
   PROX_MRT             -294745.11   56916.37  -5.179 2.56e-07 ***
   PROX_PARK             570504.81   65507.03   8.709  < 2e-16 ***
   PROX_PRIMARY_SCH      159856.14   60234.60   2.654 0.008046 ** 
   PROX_SHOPPING_MALL   -220947.25   36561.83  -6.043 1.93e-09 ***
   PROX_BUS_STOP         682482.22  134513.24   5.074 4.42e-07 ***
   NO_Of_UNITS             -245.48      87.95  -2.791 0.005321 ** 
   FAMILY_FRIENDLY       146307.58   46893.02   3.120 0.001845 ** 
   FREEHOLD              350599.81   48506.48   7.228 7.98e-13 ***

   ---Significance stars
   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
   Residual standard error: 756000 on 1421 degrees of freedom
   Multiple R-squared: 0.6507
   Adjusted R-squared: 0.6472 
   F-statistic: 189.1 on 14 and 1421 DF,  p-value: < 2.2e-16 
   ***Extra Diagnostic information
   Residual sum of squares: 8.120609e+14
   Sigma(hat): 752522.9
   AIC:  42966.76
   AICc:  42967.14
   BIC:  41731.39
   ***********************************************************************
   *          Results of Geographically Weighted Regression              *
   ***********************************************************************

   *********************Model calibration information*********************
   Kernel function: gaussian 
   Adaptive bandwidth: 30 (number of nearest neighbours)
   Regression points: the same locations as observations are used.
   Distance metric: Euclidean distance metric is used.

   ****************Summary of GWR coefficient estimates:******************
                               Min.     1st Qu.      Median     3rd Qu.
   Intercept            -1.3487e+08 -2.4669e+05  7.7928e+05  1.6194e+06
   AREA_SQM              3.3188e+03  5.6285e+03  7.7825e+03  1.2738e+04
   AGE                  -9.6746e+04 -2.9288e+04 -1.4043e+04 -5.6119e+03
   PROX_CBD             -2.5330e+06 -1.6256e+05 -7.7242e+04  2.6624e+03
   PROX_CHILDCARE       -1.2790e+06 -2.0175e+05  8.7158e+03  3.7778e+05
   PROX_ELDERLYCARE     -1.6212e+06 -9.2050e+04  6.1029e+04  2.8184e+05
   PROX_URA_GROWTH_AREA -7.2686e+06 -3.0350e+04  4.5869e+04  2.4613e+05
   PROX_MRT             -4.3781e+07 -6.7282e+05 -2.2115e+05 -7.4593e+04
   PROX_PARK            -2.9020e+06 -1.6782e+05  1.1601e+05  4.6572e+05
   PROX_PRIMARY_SCH     -8.6418e+05 -1.6627e+05 -7.7853e+03  4.3222e+05
   PROX_SHOPPING_MALL   -1.8272e+06 -1.3175e+05 -1.4049e+04  1.3799e+05
   PROX_BUS_STOP        -2.0579e+06 -7.1461e+04  4.1104e+05  1.2071e+06
   NO_Of_UNITS          -2.1993e+03 -2.3685e+02 -3.4699e+01  1.1657e+02
   FAMILY_FRIENDLY      -5.9879e+05 -5.0927e+04  2.6173e+04  2.2481e+05
   FREEHOLD             -1.6340e+05  4.0765e+04  1.9023e+05  3.7960e+05
                            Max.
   Intercept            18758355
   AREA_SQM                23064
   AGE                     13303
   PROX_CBD             11346650
   PROX_CHILDCARE        2892127
   PROX_ELDERLYCARE      2465671
   PROX_URA_GROWTH_AREA  7384059
   PROX_MRT              1186242
   PROX_PARK             2588497
   PROX_PRIMARY_SCH      3381462
   PROX_SHOPPING_MALL   38038564
   PROX_BUS_STOP        12081592
   NO_Of_UNITS              1010
   FAMILY_FRIENDLY       2072414
   FREEHOLD              1813995
   ************************Diagnostic information*************************
   Number of data points: 1436 
   Effective number of parameters (2trace(S) - trace(S'S)): 350.3088 
   Effective degrees of freedom (n-2trace(S) + trace(S'S)): 1085.691 
   AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 41982.22 
   AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 41546.74 
   BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 41914.08 
   Residual sum of squares: 2.528227e+14 
   R-square value:  0.8912425 
   Adjusted R-square value:  0.8561185 

   ***********************************************************************
   Program stops at: 2024-10-17 00:03:45.483133 

Visualizing GWR Output

As mentioned earlier, the output feature class table contains many useful information

  • Condition Number: this diagnostic evaluates local collinearity. In the presence of strong local collinearity, results become unstable. Results associated with condition numbers larger than 30, may be unreliable.
  • Local R2: these values range between 0.0 and 1.0 and indicate how well the local regression model fits observed y values. Very low values indicate the local model is performing poorly. Mapping the Local R2 values to see where GWR predicts well and where it predicts poorly may provide clues about important variables that may be missing from the regression model.
  • Predicted: these are the estimated (or fitted) y values 3. computed by GWR.
  • Residuals: to obtain the residual values, the fitted y values are subtracted from the observed y values. Standardized residuals have a mean of zero and a standard deviation of 1. A cold-to-hot rendered map of standardized residuals can be produce by using these values.
  • Coefficient Standard Error: these values measure the reliability of each coefficient estimate. Confidence in those estimates are higher when standard errors are small in relation to the actual coefficient values. Large standard errors may indicate problems with local collinearity.

To visualize the fields, I will first need to convert it into a sf dataframe.

# Select
gwr_adaptive_output <- as.data.frame(gwr_adaptive$SDF) %>%
  select(-c(2:15)) # To remove redundant variables

Then i will column bind it to the sf dataframe. glimpse() is then used to check the contents.

# Combine dataframes
gwr_sf_adaptive <- cbind(condo_resale_sf,
                      gwr_adaptive_output)

# Check result
glimpse(gwr_sf_adaptive)
Rows: 1,436
Columns: 63
$ nb                      <nb> <66, 77, 123, 238, 239, 343>, <21, 162, 163, 19…
$ wt                      <list> <0.1666667, 0.1666667, 0.1666667, 0.1666667, …
$ POSTCODE                <dbl> 118635, 288420, 267833, 258380, 467169, 466472…
$ SELLING_PRICE           <dbl> 3000000, 3880000, 3325000, 4250000, 1400000, 1…
$ AREA_SQM                <dbl> 309, 290, 248, 127, 145, 139, 218, 141, 165, 1…
$ AGE                     <dbl> 30, 32, 33, 7, 28, 22, 24, 24, 27, 31, 17, 22,…
$ PROX_CBD                <dbl> 7.941259, 6.609797, 6.898000, 4.038861, 11.783…
$ PROX_CHILDCARE          <dbl> 0.16597932, 0.28027246, 0.42922669, 0.39473543…
$ PROX_ELDERLYCARE        <dbl> 2.5198118, 1.9333338, 0.5021395, 1.9910316, 1.…
$ PROX_URA_GROWTH_AREA    <dbl> 6.618741, 7.505109, 6.463887, 4.906512, 6.4106…
$ PROX_HAWKER_MARKET      <dbl> 1.76542207, 0.54507614, 0.37789301, 1.68259969…
$ PROX_KINDERGARTEN       <dbl> 0.05835552, 0.61592412, 0.14120309, 0.38200076…
$ PROX_MRT                <dbl> 0.5607188, 0.6584461, 0.3053433, 0.6910183, 0.…
$ PROX_PARK               <dbl> 1.1710446, 0.1992269, 0.2779886, 0.9832843, 0.…
$ PROX_PRIMARY_SCH        <dbl> 1.6340256, 0.9747834, 1.4715016, 1.4546324, 0.…
$ PROX_TOP_PRIMARY_SCH    <dbl> 3.3273195, 0.9747834, 1.4715016, 2.3006394, 0.…
$ PROX_SHOPPING_MALL      <dbl> 2.2102717, 2.9374279, 1.2256850, 0.3525671, 1.…
$ PROX_SUPERMARKET        <dbl> 0.9103958, 0.5900617, 0.4135583, 0.4162219, 0.…
$ PROX_BUS_STOP           <dbl> 0.10336166, 0.28673408, 0.28504777, 0.29872340…
$ NO_Of_UNITS             <dbl> 18, 20, 27, 30, 30, 31, 32, 32, 32, 32, 34, 34…
$ FAMILY_FRIENDLY         <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0…
$ FREEHOLD                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
$ LEASEHOLD_99YR          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ MLR_RES                 <dbl> -1489099.55, 415494.57, 194129.69, 1088992.71,…
$ Intercept               <dbl> 2050011.67, 1633128.24, 3433608.17, 234358.91,…
$ y                       <dbl> 3000000, 3880000, 3325000, 4250000, 1400000, 1…
$ yhat                    <dbl> 2886531.8, 3466801.5, 3616527.2, 5435481.6, 13…
$ residual                <dbl> 113468.16, 413198.52, -291527.20, -1185481.63,…
$ CV_Score                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ Stud_residual           <dbl> 0.38207013, 1.01433140, -0.83780678, -2.846146…
$ Intercept_SE            <dbl> 516105.5, 488083.5, 963711.4, 444185.5, 211962…
$ AREA_SQM_SE             <dbl> 823.2860, 825.2380, 988.2240, 617.4007, 1376.2…
$ AGE_SE                  <dbl> 5889.782, 6226.916, 6510.236, 6010.511, 8180.3…
$ PROX_CBD_SE             <dbl> 37411.22, 23615.06, 56103.77, 469337.41, 41064…
$ PROX_CHILDCARE_SE       <dbl> 319111.1, 299705.3, 349128.5, 304965.2, 698720…
$ PROX_ELDERLYCARE_SE     <dbl> 120633.34, 84546.69, 129687.07, 127150.69, 327…
$ PROX_URA_GROWTH_AREA_SE <dbl> 56207.39, 76956.50, 95774.60, 470762.12, 47433…
$ PROX_MRT_SE             <dbl> 185181.3, 281133.9, 275483.7, 279877.1, 363830…
$ PROX_PARK_SE            <dbl> 205499.6, 229358.7, 314124.3, 227249.4, 364580…
$ PROX_PRIMARY_SCH_SE     <dbl> 152400.7, 165150.7, 196662.6, 240878.9, 249087…
$ PROX_SHOPPING_MALL_SE   <dbl> 109268.8, 98906.8, 119913.3, 177104.1, 301032.…
$ PROX_BUS_STOP_SE        <dbl> 600668.6, 410222.1, 464156.7, 562810.8, 740922…
$ NO_Of_UNITS_SE          <dbl> 218.1258, 208.9410, 210.9828, 361.7767, 299.50…
$ FAMILY_FRIENDLY_SE      <dbl> 131474.73, 114989.07, 146607.22, 108726.62, 16…
$ FREEHOLD_SE             <dbl> 115954.0, 130110.0, 141031.5, 138239.1, 210641…
$ Intercept_TV            <dbl> 3.9720784, 3.3460017, 3.5629010, 0.5276150, 1.…
$ AREA_SQM_TV             <dbl> 11.614302, 20.087361, 13.247868, 33.577223, 4.…
$ AGE_TV                  <dbl> -1.6154474, -9.3441881, -4.1023685, -15.524301…
$ PROX_CBD_TV             <dbl> -3.22582173, -6.32792021, -4.62353528, 5.17080…
$ PROX_CHILDCARE_TV       <dbl> 1.000488185, 1.471786337, -0.344047555, 1.5766…
$ PROX_ELDERLYCARE_TV     <dbl> -3.26126929, 3.84626245, 4.13191383, 2.4756745…
$ PROX_URA_GROWTH_AREA_TV <dbl> -2.846248368, -1.848971738, -2.648105057, -5.6…
$ PROX_MRT_TV             <dbl> -1.61864578, -8.92998600, -3.40075727, -7.2870…
$ PROX_PARK_TV            <dbl> -0.83749312, 2.28192684, 0.66565951, -3.340617…
$ PROX_PRIMARY_SCH_TV     <dbl> 1.59230221, 6.70194543, 2.90580089, 12.9836104…
$ PROX_SHOPPING_MALL_TV   <dbl> 2.753588422, -0.886626400, -1.056869486, -0.16…
$ PROX_BUS_STOP_TV        <dbl> 2.0154464, 4.4941192, 3.0419145, 12.8383775, 0…
$ NO_Of_UNITS_TV          <dbl> 0.480589953, -1.380026395, -0.045279967, -0.44…
$ FAMILY_FRIENDLY_TV      <dbl> -0.06902748, 2.69655779, 0.04058290, 14.312764…
$ FREEHOLD_TV             <dbl> 2.6213469, 3.0452799, 1.1970499, 8.7711485, 1.…
$ Local_R2                <dbl> 0.8846744, 0.8899773, 0.8947007, 0.9073605, 0.…
$ geometry                <POINT [m]> POINT (22085.12 29951.54), POINT (25656.…
$ geometry.1              <POINT [m]> POINT (22085.12 29951.54), POINT (25656.…

summary() is also useful to check the predicted values.

# Check yhat
summary(gwr_sf_adaptive$yhat)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  171347  1102001  1385528  1751842  1982307 13887901 

To visualise the local R2, i will use the tmap package.

# Plot R2
tm_shape(mpsz)+
  tm_polygons()+
tm_shape(gwr_sf_adaptive) + 
  tm_bubbles(col = "Local_R2",
           size = 0.15,
           border.col = "gray60",
           border.lwd = 1)
Warning: The shape mpsz is invalid. See sf::st_is_valid

Now i will try to plot them side by side.

# Visualize side by side
tmap_options(check.and.fix = TRUE)
# Set to interactive plot
tmap_mode("view")
tmap mode set to interactive viewing
# Plot AREA_SQM_SE coefficients
AREA_SQM_SE <- tm_shape(mpsz)+
  tm_polygons(alpha = 0.1) +
tm_shape(gwr_sf_adaptive) +  
  tm_dots(col = "AREA_SQM_SE",
          border.col = "gray60",
          border.lwd = 1) +
  tm_view(set.zoom.limits = c(11,14))

# Plot AREA_SQM_TV coefficients
AREA_SQM_TV <- tm_shape(mpsz)+
  tm_polygons(alpha = 0.1) +
tm_shape(gwr_sf_adaptive) +  
  tm_dots(col = "AREA_SQM_TV",
          border.col = "gray60",
          border.lwd = 1) +
  tm_view(set.zoom.limits = c(11,14))

# Show them side by side
tmap_arrange(AREA_SQM_SE, AREA_SQM_TV, 
             asp=1, ncol=2,
             sync = TRUE)
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
# Change tmap back to plot
tmap_mode("plot")
tmap mode set to plotting

This can also be done by region

# View central region
tm_shape(mpsz[mpsz$REGION_N=="CENTRAL REGION", ])+
  tm_polygons()+
tm_shape(gwr_sf_adaptive) + 
  tm_bubbles(col = "Local_R2",
           size = 0.15,
           border.col = "gray60",
           border.lwd = 1)
Warning: The shape mpsz[mpsz$REGION_N == "CENTRAL REGION", ] is invalid. See
sf::st_is_valid